######################
# Replication of previous study audit
# Purpose: This data loads the location and timing information from 38 studies fielded by the authors since 2013
# Note: Data has been scrubbed of IP Addresses, since these are counted as PID at some institutions.
# IP lookups were conducted using the IP Hub service through the "rIP" package described in the main paper.
# This is set up as an R Project in RStudio. You can load and run the code once the project ("Replication Files.rproj") has been opened by double-clicking or using the project menu in the upper-right-hand corner of RStudio
######################

library(tidyverse)
library(gridExtra)
# library(rIP) #This does not need to be run. It is included to show the process used for processing the raw files that included IP addresses.

# Set Working Directory to the Replication Files folder
setwd("./Replication Files")

# Load the studies
files <- list.files("./Data/Audit Studies")

# Function to load and process the raw study files with IP addresses.
# [This does not need to be run and has been commented out. It is included to show the coding process from the raw IP addresses.]
# processCSV <- function(files) {
#   keyData <- c()
#   for(i in 1:length(files)) {
#     tempdata <- read_csv(paste0("./Data/Old surveys (ipdata)/", files[i]))
#     tempdata <- tempdata %>%
#       filter(Finished == 1) %>%
#       select(StartDate, IPAddress)
#     ipdata <- getIPinfo(data.frame(tempdata), "IPAddress", iphubKey)
#     ipdata <- ipdata %>% distinct(ip, .keep_all = TRUE)
#     tempdata <- tempdata %>%
#       inner_join(ipdata, by = c("IPAddress" = "ip")) %>%
#       mutate(vpsStd = ifelse(block == 1, 1, 0),
#              vpsStrict = ifelse(block == 1 | block == 2, 1, 0),
#              ipForeign = ifelse(countryCode != "US", 1, 0),
#              shldBlock = ifelse(vpsStd == 1 | ipForeign == 1, 1, 0))
#     keyData <- keyData %>% bind_rows(tempdata)
#   }
#   return(keyData)
# }

#########################
# Loop to load all of the data together
#########################
audit_data <- tibble() # Create an empty tibble
for(i in files) { # Loop over all the files in the folder
  temp_data <- read_csv(paste0("./Data/Audit Studies/",i)) # Load each individual dataset
  audit_data <- audit_data %>% # bind the data together
    bind_rows(temp_data)
}

########################
# Create plots to show the chart results
########################

# Shorted name of Venezuela for chart
audit_data <- audit_data %>%
  mutate(countryName = recode(countryName, "Venezuela, Bolivarian Republic of" = "Venezuela"))

# Summarize the number of suspect responses by month
meanCheats <- audit_data %>% 
  mutate(monthyear = format(StartDate, "%Y-%m"),
         VPS = ifelse(block == 1, 1, 0),
         nonUS = ifelse(countryCode != "US", 1, 0)) %>%
  group_by(monthyear) %>%
  summarise(pctVPS = mean(VPS, na.rm = T),
            pctNUS = mean(nonUS, na.rm = T),
            totResp = n())

# Plot A, the mean number of suspected respondents by month
p1 <- ggplot(meanCheats) +
  geom_bar(aes(x = monthyear, y = totResp), stat = "identity") +
  xlab("Year-Month of Survey") + ylab("Number [R]s") + theme_bw() +
  ggtitle("(A)") +
  theme(axis.text.x = element_text(angle = 90, size = 8))
p1

# Plot B, the mean number, broken down by VPS and non-US over time
p2l <- meanCheats %>%
  gather(varName, proportion, pctVPS:pctNUS) %>%
  ggplot() +
  geom_line(aes(x = monthyear, y = proportion, group = varName, color = varName), size = 1.5) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, size = 8), legend.position = c(.22,.77)) +
  scale_color_discrete(name = "Type", labels = c("Non-U.S.", "VPS")) +
  ylab("Proportion [R]s") + xlab("Year-Month of Survey") +
  ggtitle("(B)")
p2l

# Plot C, the proportion of non-US respondents for countries with more than 4 respondents
p3 <- audit_data %>%
  filter(countryCode != "US" & block != 1) %>%
  count(countryName) %>%
  mutate(propCty = n/sum(n)) %>%
  filter(n > 4 & countryName != "Unknown") %>%
  ggplot() +
  geom_bar(aes(x = countryName, y = propCty), stat = "identity") + #coord_flip() +
  xlab("Country") + ylab("Proportion Non-U.S. [R]s") + theme_bw() +
  ggtitle("(C)") + 
  theme(axis.text.x = element_text(angle = 90, size = 8))
p3

# Create data with the number of resondents from Venezuela and India over time
timelineCheats <- audit_data %>% 
  mutate(year = format(StartDate, "%Y")) %>%
  filter(countryName == "Venezuela" |
           countryName == "India") %>%
  group_by(year, countryName) %>%
  summarize(sumCheats = n())

# Plot D, number of respondents on US surveys that were actually from India or Venezuela
p4 <- timelineCheats %>%
  ggplot() +
  geom_line(aes(x = as.numeric(year), y = sumCheats, color = countryName), size = 1.5) +
  theme_bw() +
  scale_color_manual(name = "Country", labels = c("India","Venezuela"), 
                     values = c("#E69F00", "darkgreen")) +
  xlab("Year") + ylab("Number [R]s") +
  ggtitle("(D)") +
  theme(legend.position = c(.3,.75))
p4

# Collect plots into grid for Figure 1 [Will need to adjust size to accomodate legends]
g <- grid.arrange(p1,p2l,p3,p4, ncol = 2)

# Write the plot to disk
ggsave("./Output Figures/timeseriesPlot.pdf", g, device = "pdf", height = 6, width = 8)

###########################################
# Crosscheck IP Hub vs. Know Your IP
###########################################
# Load results from IP Hub, IP Void, and AbuseIPDB, and IP Hub data combined
# Note: since merging was done on IP Addresses, full merge data has been omitted
crosscheck <- read_csv("./Data/Comparison Data/crosscheck.csv")

# Because IP addresses are omitted, the code below does not need to be run and is commented.
# It is included to demonstrate how the variables used for the checks were created.
# crosscheck <- iphubCheck %>%
#   inner_join(ipvoid, by = "IPAddress") %>%
#   inner_join(abuseipdb, by = "IPAddress") %>%
#   mutate(iphubUS = ifelse(countryCode != "US", 1, 0),
#          iphubVPS = ifelse(block == 1, 1, 0),
#          ipvoidUS = ifelse(ipvoid.country_code != "(US) United States", 1, 0),
#          ipvoidVPS = ifelse(ipvoid.blacklist_status != "POSSIBLY SAFE 0/96", 1, 0),
#          abuseipdbUS = ifelse(abuseipdb.country != "\r\n\r\nUnited States\r\n", 1, 0),
#          abuseipdbVPS = ifelse(abuseipdb.usage_type == "\r\nData Center/Web Hosting/Transit\r\n", 1, 0)) %>%
#   mutate(iphubExclude = ifelse(iphubUS == 1 | iphubVPS == 1, "iphubBlock", "iphubSafe"),
#          ipvoidExclude = ifelse(ipvoidUS == 1 | ipvoidVPS == 1, "ipvoidBlock", "ipvoidSafe"),
#          abuseipdbExclude = ifelse(abuseipdbUS == 1 | abuseipdbVPS == 1, "abuseipdbBlock", "abuseipdbSafe"))

# Comparison of coding for US IPs
table(crosscheck$iphubUS, crosscheck$ipvoidUS)
table(crosscheck$iphubUS, crosscheck$abuseipdbUS)

# Comparison of coding for VPS use
table(crosscheck$iphubVPS, crosscheck$ipvoidVPS)
table(crosscheck$iphubVPS, crosscheck$abuseipdbVPS)

# Table 1 results for all exclusion criteria
table(crosscheck$iphubExclude, crosscheck$ipvoidExclude)
table(crosscheck$iphubExclude, crosscheck$abuseipdbExclude)

#################################
# Check of timing of completion to see if this is an issue of a few users taking the survey many times
# Figures in Section A7 of Appendix
#################################

# Load needed libraries
library(haven)
library(tidyverse)
library(lubridate)
library(ggalt)


# Load data from teh two experiments
retro1 <- read_dta("./Data/retrostudy1 data.dta")
retro2 <- read_dta("./Data/retrostudy2 data.dta") 

# Convert first experiment to only include those who finished the survey and were deemed suspect by their IP address and format the dates for the plot
retro1 <- retro1 %>%
  filter(Finished == 1 & (block == 1 | countrycode != "US")) %>%
  mutate(start = ymd_hms(StartDate),
         end = ymd_hms(EndDate))

# Plot the start and finish times by ISP provider for suspect IP Addresses
tp1 <- ggplot(retro1) + 
  geom_dumbbell(aes(x = start, xend = end, y = as.factor(isp)), 
                colour_x = "grey", colour_xend = "black") +
  theme_bw() + labs(x = "Time of Start/Finish", y = "ISP", title = "Experiment 1")
tp1

# Write plot to disk
ggsave("./Output Figures/timingE1.pdf", tp1, device = "pdf", height = 8, width = 6)

# Same data converstion for experiment 2
retro2 <- retro2 %>%
  filter(Finished == 1 & shldblock == 1) %>%
  mutate(start = ymd_hms(StartDate),
         end = ymd_hms(EndDate))

# Plot for experiment 2
tp2 <- ggplot(retro2) + 
  geom_dumbbell(aes(x = start, xend = end, y = as.factor(isp)), 
                colour_x = "grey", colour_xend = "black") +
  theme_bw() + labs(x = "Time of Start/Finish", y = "ISP", title = "Experiment 2")
tp2

# Write plot to disk
ggsave("./Output Figures/timingE2.pdf", tp2, device = "pdf", height = 10, width = 8)

